## Callin Switzer ## 29 Nov 2016 ## 8 Dec 2016 Update -- re-digitized pollen and anthers very carefully ## Use cross validation to choose smoothing parameter, using a smoothing spline ## 8 Feb 2017, cleaned up code for submission of paper # use ten -fold cross-validation to decide smoothing parameters or plot noise/resoluation tradeoff
ipak <- function(pkg){
new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
if(length(new.pkg)) install.packages(new.pkg, dependencies = TRUE)
sapply(pkg, require, character.only = TRUE)
}
packages <- c("ggplot2", "scales", "multcomp", "plyr", "car", "lme4", "signal")
ipak(packages)
## ggplot2 scales multcomp plyr car lme4 signal
## TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# set random state
set.seed(12345)
# print session info and time the code was run last
Sys.time()
## [1] "2017-03-15 09:30:23 EDT"
sessionInfo()
## R version 3.3.2 (2016-10-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X El Capitan 10.11.6
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] signal_0.7-6 lme4_1.1-12 Matrix_1.2-8 car_2.1-4
## [5] plyr_1.8.4 multcomp_1.4-6 TH.data_1.0-8 MASS_7.3-45
## [9] survival_2.40-1 mvtnorm_1.0-5 scales_0.4.1 ggplot2_2.2.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.9 nloptr_1.0.4 tools_3.3.2
## [4] digest_0.6.12 evaluate_0.10 tibble_1.2
## [7] gtable_0.2.0 nlme_3.1-130 lattice_0.20-34
## [10] mgcv_1.8-16 yaml_2.1.14 parallel_3.3.2
## [13] SparseM_1.74 stringr_1.1.0 knitr_1.15.1
## [16] MatrixModels_0.4-1 rprojroot_1.2 grid_3.3.2
## [19] nnet_7.3-12 rmarkdown_1.3 minqa_1.2.4
## [22] magrittr_1.5 backports_1.0.5 codetools_0.2-15
## [25] htmltools_0.3.5 splines_3.3.2 assertthat_0.1
## [28] pbkrtest_0.4-6 colorspace_1.3-2 quantreg_5.29
## [31] sandwich_2.3-4 stringi_1.1.2 lazyeval_0.2.0
## [34] munsell_0.4.3 zoo_1.7-14
dataDirectory = '/Users/callinswitzer/Dropbox/ExperSummer2016/Kalmia/Manuscript/DatasetsSupplemental/'
# read in metadata
dfile <- "LaurelsOnly.csv"
metDat <- read.csv(paste0(dataDirectory, dfile))
metDat <- metDat[metDat$redigitizedFile!= "", ]
# set constants:
fps <- 5000 # frames per second
# read in each .csv file for analysis
# make a list of data frames
digdirect <- "/Users/callinswitzer/Dropbox/ExperSummer2016/Kalmia/Manuscript/DatasetsSupplemental/CleanKalmiaDigitized/"
Note: We don’t report anther stuff in the paper, because it’s basically the same as the pollen stuff.
# not run
newDF <- data.frame()
bestSmthAnth <- numeric()
for(ii in 1:nrow(metDat)){
ddfile <- paste0(digdirect, metDat$redigitizedFile[ii])
dp <- read.csv(ddfile)
# get anther and pollen locations
antherPoll <- data.frame(anthx = dp$pt3_cam1_X, anthy= dp$pt3_cam1_Y,
polx = dp$pt4_cam1_X, poly= dp$pt4_cam1_Y)
# gives only rows where either anth and pollen are complete
antherPoll = antherPoll[ complete.cases(antherPoll[c('anthx')]) |
complete.cases(antherPoll[c('polx')]), ]
# if x value starts to right of screen, reverse points,
# so all x values start on the left part of the screen at 0
if(lm(antherPoll[,1] ~ I(1:length( antherPoll[,1])))$coefficients[2] < 0 ){
antherPoll$anthx <- metDat[ii,'vidWidth'] - antherPoll$anthx
antherPoll$polx <- metDat[ii,'vidWidth'] - antherPoll$polx
}
# visualize groups for cv
anther <- antherPoll[,c("anthx", "anthy") ]
anther <- anther[complete.cases(anther), ]
anther$sampNum <- sample(rep(sample(1:10), length = nrow(anther)))
anther$time1 <- 1:nrow(anther)
newL <- data.frame()
for(spp in seq(0.001, to = 0.49, by = 0.01)){
sumErr <- numeric()
for(cvg in 1:10){
xy <- anther
colnames(xy) <- c("x", "y", "sampeNum", "time1")
xy$y[anther$sampNum == cvg] <- xy$x[anther$sampNum == cvg]<- NA
xy = xy[complete.cases(xy), ]
#spp <- 0.001
smy <- smooth.spline(y = xy$y, x = xy$time1, spar = spp)
smx <- smooth.spline(y = xy$x, x = xy$time1, spar = spp)
# plot(y = smy$y, x = smx$y, type = 'n')
# points(xy$x, xy$y, pch = 20) # actual values
newPredsx = predict(object = smx, x = anther$time1[anther$sampNum == cvg])
newPredsy = predict(object = smy, x = anther$time1[anther$sampNum == cvg])
# points(newPredsx$y, y = newPredsy$y, pch = 20, col = 'red') # predicted gaps
# points(x = anther$anthx[anther$sampNum == cvg],
# y = anther$anthy[anther$sampNum == cvg],col = 'grey') # actual gaps
# legend("bottomright", legend = c("raw data", "predicted gaps", "observed gaps"),
# pch = c(20, 20, 1), col = c('black', 'red', 'grey'))
# add up differences -- predicted - observed
sumdiffs <- data.frame(xObs = anther$anthx[anther$sampNum == cvg], xPred = newPredsx$y,
yObs = anther$anthy[anther$sampNum == cvg], yPred = newPredsy$y)
# find distance between points
sumdiffs$dists = apply(X = sumdiffs, MARGIN = 1, function(x) {
return(sqrt((x[1] - x[2])^2 + (x[3] - x[4])^2))
})
# sum of errors / total number of predicted points
sumErr[cvg] <- sum(sumdiffs$dists) / nrow(sumdiffs)
}
newL <- rbind(newL, c(spp, sum(sumErr) / 10))
}
colnames(newL) <- c("smoothingParameter", "sumOfErrors")
newL
plot(newL)
bestSmthAnth[ii] <- newL$smoothingParameter[which.min(newL$sumOfErrors)]
}
hist(bestSmthAnth)
abline(v = mean(bestSmthAnth))
mean(bestSmthAnth)
median(bestSmthAnth) #,0.291, maybe go with median, since the dist is skewed
### END OF 10-fold CV for anthers
bestSmthPol <- numeric()
for(ii in 1:nrow(metDat)){
ddfile <- paste0(digdirect, metDat$redigitizedFile[ii])
dp <- read.csv(ddfile)
# get anther and pollen locations
antherPoll <- data.frame(anthx = dp$pt3_cam1_X, anthy= dp$pt3_cam1_Y,
polx = dp$pt4_cam1_X, poly= dp$pt4_cam1_Y)
# gives only rows where either anth and pollen are complete
antherPoll = antherPoll[ complete.cases(antherPoll[c('anthx')]) |
complete.cases(antherPoll[c('polx')]), ]
# if x value starts to right of screen, reverse points,
# so all x values start on the left part of the screen at 0
if(lm(antherPoll[,1] ~ I(1:length( antherPoll[,1])))$coefficients[2] < 0 ){
antherPoll$anthx <- metDat[ii,'vidWidth'] - antherPoll$anthx
antherPoll$polx <- metDat[ii,'vidWidth'] - antherPoll$polx
}
# visualize groups for cv
pollen <- antherPoll[,c("polx", "poly") ]
pollen <- pollen[complete.cases(pollen), ]
pollen$sampNum <- sample(rep(sample(1:10), length = nrow(pollen)))
pollen$time1 <- 1:nrow(pollen)
newL <- data.frame()
for(spp in seq(0.001, to = 0.49, by = 0.01)){
sumErr <- numeric()
for(cvg in 1:10){
xy <- pollen
colnames(xy) <- c("x", "y", "sampeNum", "time1")
xy$y[pollen$sampNum == cvg] <- xy$x[pollen$sampNum == cvg]<- NA
xy = xy[complete.cases(xy), ]
#spp <- 0.001
smy <- smooth.spline(y = xy$y, x = xy$time1, spar = spp)
smx <- smooth.spline(y = xy$x, x = xy$time1, spar = spp)
# plot(y = smy$y, x = smx$y, type = 'n')
# points(xy$x, xy$y, pch = 20) # actual values
newPredsx = predict(object = smx, x = pollen$time1[pollen$sampNum == cvg])
newPredsy = predict(object = smy, x = pollen$time1[pollen$sampNum == cvg])
# points(newPredsx$y, y = newPredsy$y, pch = 20, col = 'red') # predicted gaps
# points(x = pollen$polx[pollen$sampNum == cvg],
# y = pollen$poly[pollen$sampNum == cvg],col = 'grey') # actual gaps
# legend("bottomright", legend = c("raw data", "predicted gaps", "observed gaps"),
# pch = c(20, 20, 1), col = c('black', 'red', 'grey'))
# add up differences -- predicted - observed
sumdiffs <- data.frame(xObs = pollen$polx[pollen$sampNum == cvg], xPred = newPredsx$y,
yObs = pollen$poly[pollen$sampNum == cvg], yPred = newPredsy$y)
# find distance between points
sumdiffs$dists = apply(X = sumdiffs, MARGIN = 1, function(x) {
return(sqrt((x[1] - x[2])^2 + (x[3] - x[4])^2))
})
# sum of errors / total number of predicted points
sumErr[cvg] <- sum(sumdiffs$dists) / nrow(sumdiffs)
}
newL <- rbind(newL, c(spp, sum(sumErr) / 10))
}
colnames(newL) <- c("smoothingParameter", "sumOfErrors")
newL
plot(newL)
bestSmthPol[ii] <- newL$smoothingParameter[which.min(newL$sumOfErrors)]
}
hist(bestSmthPol)
abline(v = mean(bestSmthPol)) #
median(bestSmthPol) # ~0.29, go with median, since the dist is skewed
## [1] 0.291
mean(bestSmthPol)# ~0.26
## [1] 0.259125
# Decision: smoothing parameter is 0.29
theme_set(theme_classic())
ggplot(newL, aes(x = smoothingParameter, y = sumOfErrors)) +
geom_point() +
labs(x = "Smoothing Parameter", y = "Average out-of-sample error")
saveDir <- "/Users/callinswitzer/Dropbox/ExperSummer2016/Kalmia/Manuscript/Media/"
ggsave(paste0(saveDir, "outOfSampleErrors_Example.pdf"), width = 4, height = 3)
plot(newL)
# plot undersmoothed vs. oversmoothed
xy <- pollen
colnames(xy) <- c("x", "y", "sampeNum", "time1")
xy = xy[complete.cases(xy), ]
xy$sampeNum <- NULL
xy$trt <- "spar = 0"
spp <- 0.29
smy_low <- data.frame(predict(smooth.spline(y = xy$y, x = xy$time1, spar = spp)))
colnames(smy_low) <- c('time1', 'y')
smx_low <- data.frame(predict(smooth.spline(y = xy$x, x = xy$time1, spar = spp)))
colnames(smx_low) <- c('time1', 'x')
lowDF <- merge(smx_low, smy_low)
lowDF$trt <- "spar = 0.29"
spp <- 0.5
smy_high <- data.frame(predict(smooth.spline(y = xy$y, x = xy$time1, spar = spp)))
colnames(smy_high) <- c('time1', 'y')
smx_high <- data.frame(predict(smooth.spline(y = xy$x, x = xy$time1, spar = spp)))
colnames(smx_high) <- c('time1', 'x')
highDF <- merge(smx_high, smy_high)
highDF$trt <- "spar = 0.5"
longDF <- rbind(xy, lowDF, highDF)
library(viridis)
ggplot(longDF, aes(x = x, y = y, color = trt)) +
geom_path(alpha = 0.5) +
geom_point(aes(shape = trt)) +
xlim(c(270, 280)) +
ylim(c(15, 30)) +
labs(x = "X position (pixels)", y = "Y position (pixels)") +
coord_fixed() +
theme(legend.justification = c(1, 0), legend.position = c(1, .5)) +
scale_color_viridis(name = "Smoothing Parameter", discrete = TRUE,end = 0.9) +
scale_shape_discrete(name = "Smoothing Parameter")
## Warning: Removed 66 rows containing missing values (geom_path).
## Warning: Removed 66 rows containing missing values (geom_point).
ggsave(paste0(saveDir, "SuppFig2_smoothingParameter_Example.pdf"), width = 4, height = 4)
## Warning: Removed 66 rows containing missing values (geom_path).
## Warning: Removed 66 rows containing missing values (geom_point).
spp = 0.29
for(ii in 1:nrow(metDat)){
ddfile <- paste0(digdirect, metDat$redigitizedFile[ii])
dp <- read.csv(ddfile)
# get anther and pollen locations
antherPoll <- data.frame(anthx = dp$pt3_cam1_X, anthy= dp$pt3_cam1_Y,
polx = dp$pt4_cam1_X, poly= dp$pt4_cam1_Y)
# gives only rows where either anth and pollen are complete
antherPoll = antherPoll[ complete.cases(antherPoll[c('anthx')]) |
complete.cases(antherPoll[c('polx')]), ]
# if x value starts to right of screen, reverse points,
# so all x values start on the left part of the screen at 0
if(lm(antherPoll[,1] ~ I(1:length( antherPoll[,1])))$coefficients[2] < 0 ){
antherPoll$anthx <- metDat[ii,'vidWidth'] - antherPoll$anthx
antherPoll$polx <- metDat[ii,'vidWidth'] - antherPoll$polx
}
# visualize groups for cv
pollen <- antherPoll[,c("polx", "poly") ]
pollen <- pollen[complete.cases(pollen), ]
pollen$sampNum <- sample(rep(sample(1:10), length = nrow(pollen)))
pollen$time1 <- 1:nrow(pollen)
xy <- pollen
colnames(xy) <- c("x", "y", "sampeNum", "time1")
xy = xy[complete.cases(xy), ]
smy <- smooth.spline(y = xy$y, x = xy$time1, spar = spp)
smx <- smooth.spline(y = xy$x, x = xy$time1, spar = spp)
plot(y = smy$y, x = smx$y, pch = 20, col = rgb(0,0,0,0.5), asp = 1)
points(xy$x, xy$y, pch = 20, col = rgb(1,0,0, 0.5)) # actual values
legend("bottomright", pch = c(20,20), col = c('black', 'red'), legend = c("predicted", "observed"))
# Sys.sleep(0.5)
}
Not run
spp = 0.29
for(ii in 1:nrow(metDat)){
ddfile <- paste0(digdirect, metDat$redigitizedFile[ii])
dp <- read.csv(ddfile)
# get anther and pollen locations
antherPoll <- data.frame(anthx = dp$pt3_cam1_X, anthy= dp$pt3_cam1_Y,
polx = dp$pt4_cam1_X, poly= dp$pt4_cam1_Y)
# gives only rows where either anth and pollen are complete
antherPoll = antherPoll[ complete.cases(antherPoll[c('anthx')]) |
complete.cases(antherPoll[c('polx')]), ]
# if x value starts to right of screen, reverse points,
# so all x values start on the left part of the screen at 0
if(lm(antherPoll[,1] ~ I(1:length( antherPoll[,1])))$coefficients[2] < 0 ){
antherPoll$anthx <- metDat[ii,'vidWidth'] - antherPoll$anthx
antherPoll$polx <- metDat[ii,'vidWidth'] - antherPoll$polx
}
# visualize groups for cv
anther <- antherPoll[,c("anthx", "anthy") ]
anther <- anther[complete.cases(anther), ]
anther$time1 <- 1:nrow(anther)
xy <- anther
colnames(xy) <- c("x", "y", "time1")
xy = xy[complete.cases(xy), ]
smy <- smooth.spline(y = xy$y, x = xy$time1, spar = spp)
smx <- smooth.spline(y = xy$x, x = xy$time1, spar = spp)
plot(y = smy$y, x = smx$y, pch = 20, col = rgb(0,0,0,0.5), asp = 1)
points(xy$x, xy$y, pch = 20, col = rgb(1,0,0, 0.5)) # actual values
legend("bottomright", pch = c(20,20), col = c('black', 'red'), legend = c("predicted", "observed"))
# Sys.sleep(0.5)
}